In [1]:
#################################################################################
#####   INF889E - Méthodes d'intelligence artificielle en bioinformatique   #####
#####             Classification de VIH par zones géographiques             #####
#################################################################################
#####   Author: Riccardo Z******                                            #####
#####   This program is partly inspired by the work presented in a class    #####
#####   workshop by Dylan Lebatteux.                                        #####
#################################################################################
In [2]:
# Import functions
import re
import joblib
import numpy as np
import pandas as pd
from os import listdir
from random import shuffle
from progressbar import ProgressBar
from Bio import SeqIO, pairwise2
from Bio.motifs import create
from sklearn import svm
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import RFE
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D
from dna_features_viewer import GraphicFeature, GraphicRecord
import matplotlib.pyplot as plt
import seaborn as sns
In [3]:
##############################################
#####        IMPORTANT VARIABLES         #####
##############################################
In [4]:
# Scope of classification: if "ALL", classify by region globaly
# If AFR, ASI, CAM, CRB, EUR, FSU, MEA, NAM, OCE or SAM, classify by country within this chosen region 
scope = "CAM"
# Access path for the FASTA files (one file for each region)
path = "../../../../data/" + "all"
# Name of trained model when saving
model_name = "cam.pkl"
# For sampling purposes: will process max n sequences for each target class
n_samples = 2000
# Classification features as sum (false) of motifs or as frequency (true) of motifs
freq = False
# Elimination step for features selection
step = 5
# Number of features to select
n_features = 100
# Train / Test split ratio
split_raito = 0.8
# Dimensions for graphs (2D or 3D)
n_components = 2
# Set maximum number of incorrect records to analyse at the end
max_incorrect = 10
# Set maximum number of correct records to compute alignment with at the end
max_correct = 1000
# Set the length k of the features based on k-mers
k = 5
In [5]:
##############################################
#####        DATA INITIALISATION         #####
##############################################
print("\n         DATA INITIALISATION         ")
print("=====================================")
         DATA INITIALISATION         
=====================================
In [6]:
# Will contain all the data rows, in the form of biopython seq_records objects
data = []
# Will contain a pair of target class -> number of data rows with this target class
targets = {}
# Process raw record label information into its annotations, then insert it into data
# To update if the label of sequences in the FASTA files changes
def add_record(record, target):
    # Initialiation of the seq_record
    header = record.id.split(".")
    record.id = header[4]
    record.name = header[3]
    record.seq = record.seq.upper()
    record.annotations = {"target": target, "subtype": header[0], "country": header[1]}
    # Add it to the data table and update the target classes dictionary
    targets[target] = targets.get(target, 0) + 1
    data.append(record)
In [7]:
# Properly fills the data table using the above function
if scope == "ALL":
    # Used to show progress
    progress = ProgressBar()
    # If scope is ALL, each filename is the name of each region used as a target class
    for filename in progress(sorted(listdir(path))):
        target = filename.split('.')[0]
        for record in SeqIO.parse(path + "/" + filename, "fasta"):
            add_record(record, target)
    print("")
else:
    # Else, countries are target classes, and the scope region is the filename
    for record in SeqIO.parse(path + "/" + scope + ".fasta", "fasta"):
        target = record.id.split(".")[1]
        add_record(record, target)
In [8]:
# Dipslay data information
print("Data information:")
print("Number of sequences:", sum(targets.values()))
print("Number of targets:", len(targets))
print("Minimum number of instances:", min(targets.values()))
print("Maximum number of instances:", max(targets.values()))

# Dipslay data summary
print("\nData summary:")
for key, value in targets.items(): 
    print("Target:", key, "| Number of sequences:", value)

# Display the first 5 samples
print("\nInformation of the first 5 samples:")
for i in range(5):
    print("ID:", data[i].id, "| Sequence:", data[i].seq[0:50], "| Annotations:", data[i].annotations)
Data information:
Number of sequences: 3516
Number of targets: 5
Minimum number of instances: 11
Maximum number of instances: 2038

Data summary:
Target: HN | Number of sequences: 497
Target: MX | Number of sequences: 2038
Target: PA | Number of sequences: 770
Target: SV | Number of sequences: 200
Target: BZ | Number of sequences: 11

Information of the first 5 samples:
ID: AF096658 | Sequence: TCCCAACAAAGACAAGACATCCTTGATCTGTGGGTCTACAACACACAAGG | Annotations: {'target': 'HN', 'subtype': 'B', 'country': 'HN'}
ID: AF096659 | Sequence: TCCCAGAAAAGACAGGACATCCTTGATTTATGGGTCTACCACACACAAGG | Annotations: {'target': 'HN', 'subtype': 'B', 'country': 'HN'}
ID: AF096660 | Sequence: TCCCAGAAAAGACAAGACATCCTTGATCTGTGGGTCTACAACACACAAGG | Annotations: {'target': 'HN', 'subtype': 'B', 'country': 'HN'}
ID: AF096661 | Sequence: TCCCAAAAAAGACAAGACATCCTTGATCTGTGGATCTACCACACACAAGG | Annotations: {'target': 'HN', 'subtype': 'B', 'country': 'HN'}
ID: AF096662 | Sequence: TCCCAAAAAAGACAAGACATCCTTGATCTGTGGATCTACCACACACAAGG | Annotations: {'target': 'HN', 'subtype': 'B', 'country': 'HN'}
In [9]:
##############################################
#####      TRAIN / TEST DATA SPLIT       #####
##############################################
# Initialise train/test tables that will contain the data
train_data = []
test_data = []
# Initialise train/test dictionaries that will contain the number of instances for each target
test_split = {}
train_split = {}
# Initialise the dictionary with the targets keys and the value 0
test_split = test_split.fromkeys(targets.keys(), 0)
train_split = train_split.fromkeys(targets.keys(), 0)
# Shuffle the data
shuffle(data)
In [10]:
# Iterate through the data
for d in data:
    # Get this records's target class
    target = d.annotations["target"]
    # For sampling purposes: train/test threshold is based on n_samples if there is too much records for this target
    threshold = min(targets[target], n_samples) * split_raito
    # Until threshold for this target is reached, fills train data
    if train_split[target] < threshold: 
        train_data.append(d)
        train_split[target] += 1
    # Then, fills test data (until eventually n_samples are collected)
    elif test_split[target] < n_samples * (1-split_raito): 
        test_data.append(d)
        test_split[target] += 1
# Shuffle the data
shuffle(train_data)
shuffle(test_data)
In [11]:
# Data summary of the train/test split
print("\nTrain/Test split summary:")
for train_key, test_key in zip(train_split.keys(), test_split.keys()):
    print("Target:", train_key, "| Train instances:", train_split[train_key], "| Test instances:", test_split[test_key])
print("\nTotal number of training instances:", len(train_data))
print("Total number of testing instances:", len(test_data))
Train/Test split summary:
Target: HN | Train instances: 398 | Test instances: 99
Target: MX | Train instances: 1600 | Test instances: 400
Target: PA | Train instances: 616 | Test instances: 154
Target: SV | Train instances: 160 | Test instances: 40
Target: BZ | Train instances: 9 | Test instances: 2

Total number of training instances: 2783
Total number of testing instances: 695
In [12]:
##################################################
#####  FEATURES GENERATION BASED ON K-MERS   #####
##################################################
print("\n         FEATURES GENERATION         ")
print("=====================================")
         FEATURES GENERATION         
=====================================
In [13]:
# Initialize an empty dictionary for the k-mers motifs features
instances = {}
# Used to show progress
progress = ProgressBar()
# Iterate through the training data
for d in train_data:
    # Go through the sequence 
    for i in range(0, len(d.seq) - k + 1, 1):
        # Get the current k-mer motif feature
        feature = str(d.seq[i:i + k])
        # If it contains only the characters "A", "C", "G" or "T", it will be saved
        if re.match('^[ACGT]+$', feature): 
            instances[feature] = 0
    progress.update(len(instances))
    # No need to keep going if motifs dictonary reaches max size
    if len(instances) == 4 ** k:
        break
# Used to show progress
progress.finish()
# Save dictonary keys as biopython motifs object
motifs = create(instances.keys())
# Display the number of features
print("\nNumber of features:", len(motifs.instances), "\n")
| |           #                                    | 1023 Elapsed Time: 0:00:08

Number of features: 1023 

In [14]:
######################################################################
##### GENERATION OF THE FEATURE MATRIX (x) AND TARGET VECTOR (y) #####
######################################################################
In [15]:
# Function to generate feature matrix and target vector
def generateFeatures(data):
    # Initialize the feature matrix
    X = []
    # Initialize the target vector
    y = []
    # Used to show progress
    progress = ProgressBar()
    # Iterate through the data
    for d in progress(data):
        # Generate an empty dictionary
        x = {}
        # Initialize the dictionary with targets as keys and 0 as value
        x = x.fromkeys(motifs.instances, 0)
        # Compute X (features matrix): the number of occurrence of k-mers (with overlaping)
        for i in range(0, len(d.seq) - k + 1, 1):
            feature = d.seq[i:i + k]
            # Attempt to increment the number of occurrences of the current k-mer feature
            try: x[feature] += 1
            # It could fail because the current k-mer is not full ACGT
            except: pass
        # Save the features vector in the features matrix
        X.append(list(x.values()))
        # Save the target class in the target vector
        y.append(d.annotations["target"])
    # Return matrices X and y (feature matrix and target vector)
    return X, y
In [16]:
# Generate train/test feature matrices and target vectors
x_train, y_train = generateFeatures(train_data)
x_test, y_test = generateFeatures(test_data)
100% (2783 of 2783) |####################| Elapsed Time: 0:00:10 Time:  0:00:10
100% (695 of 695) |######################| Elapsed Time: 0:00:02 Time:  0:00:02
In [17]:
# Function to generate feature matrix and target vector based on k-mer frequency, not the sum
def generateFreqFeatures(x_sum):
    X = []
    for x in x_sum:
        total = sum(x)
        X.append(list(map((lambda i: i / total), x)))
    return X
In [18]:
# If Freq is ture, then the features matrix are frequency of k-mers, not their sum
if freq:
    x_train = generateFreqFeatures(x_train)
    x_test = generateFreqFeatures(x_test)
In [19]:
##############################################
#####       FEATURES NORMALISATION       #####
##############################################
In [20]:
# Instantiate a MinMaxScaler between 0 and 1
minMaxScaler = MinMaxScaler(feature_range = (0,1))
# Apply a scaling to the train and test set
x_train = minMaxScaler.fit_transform(x_train)
x_test = minMaxScaler.fit_transform(x_test)
In [21]:
##############################################
#####         FEATURES SELECTION         #####
##############################################
print("\n         FEATURES SELECTION          ")
print("=====================================")
         FEATURES SELECTION          
=====================================
In [22]:
# Instantiate a linear model based on svm
model = svm.SVC(C = 1.0, kernel='linear', class_weight = None)
# Instantiate the RFE
rfe = RFE(model, n_features_to_select = n_features, step = step, verbose=True)
# Apply RFE and transform the training matrix
x_train = rfe.fit_transform(x_train, y_train)
# Tranform the test matrix (will be useed later for evaluation purposes)
x_test = rfe.transform(x_test)
Fitting estimator with 1023 features.
Fitting estimator with 1018 features.
Fitting estimator with 1013 features.
Fitting estimator with 1008 features.
Fitting estimator with 1003 features.
Fitting estimator with 998 features.
Fitting estimator with 993 features.
Fitting estimator with 988 features.
Fitting estimator with 983 features.
Fitting estimator with 978 features.
Fitting estimator with 973 features.
Fitting estimator with 968 features.
Fitting estimator with 963 features.
Fitting estimator with 958 features.
Fitting estimator with 953 features.
Fitting estimator with 948 features.
Fitting estimator with 943 features.
Fitting estimator with 938 features.
Fitting estimator with 933 features.
Fitting estimator with 928 features.
Fitting estimator with 923 features.
Fitting estimator with 918 features.
Fitting estimator with 913 features.
Fitting estimator with 908 features.
Fitting estimator with 903 features.
Fitting estimator with 898 features.
Fitting estimator with 893 features.
Fitting estimator with 888 features.
Fitting estimator with 883 features.
Fitting estimator with 878 features.
Fitting estimator with 873 features.
Fitting estimator with 868 features.
Fitting estimator with 863 features.
Fitting estimator with 858 features.
Fitting estimator with 853 features.
Fitting estimator with 848 features.
Fitting estimator with 843 features.
Fitting estimator with 838 features.
Fitting estimator with 833 features.
Fitting estimator with 828 features.
Fitting estimator with 823 features.
Fitting estimator with 818 features.
Fitting estimator with 813 features.
Fitting estimator with 808 features.
Fitting estimator with 803 features.
Fitting estimator with 798 features.
Fitting estimator with 793 features.
Fitting estimator with 788 features.
Fitting estimator with 783 features.
Fitting estimator with 778 features.
Fitting estimator with 773 features.
Fitting estimator with 768 features.
Fitting estimator with 763 features.
Fitting estimator with 758 features.
Fitting estimator with 753 features.
Fitting estimator with 748 features.
Fitting estimator with 743 features.
Fitting estimator with 738 features.
Fitting estimator with 733 features.
Fitting estimator with 728 features.
Fitting estimator with 723 features.
Fitting estimator with 718 features.
Fitting estimator with 713 features.
Fitting estimator with 708 features.
Fitting estimator with 703 features.
Fitting estimator with 698 features.
Fitting estimator with 693 features.
Fitting estimator with 688 features.
Fitting estimator with 683 features.
Fitting estimator with 678 features.
Fitting estimator with 673 features.
Fitting estimator with 668 features.
Fitting estimator with 663 features.
Fitting estimator with 658 features.
Fitting estimator with 653 features.
Fitting estimator with 648 features.
Fitting estimator with 643 features.
Fitting estimator with 638 features.
Fitting estimator with 633 features.
Fitting estimator with 628 features.
Fitting estimator with 623 features.
Fitting estimator with 618 features.
Fitting estimator with 613 features.
Fitting estimator with 608 features.
Fitting estimator with 603 features.
Fitting estimator with 598 features.
Fitting estimator with 593 features.
Fitting estimator with 588 features.
Fitting estimator with 583 features.
Fitting estimator with 578 features.
Fitting estimator with 573 features.
Fitting estimator with 568 features.
Fitting estimator with 563 features.
Fitting estimator with 558 features.
Fitting estimator with 553 features.
Fitting estimator with 548 features.
Fitting estimator with 543 features.
Fitting estimator with 538 features.
Fitting estimator with 533 features.
Fitting estimator with 528 features.
Fitting estimator with 523 features.
Fitting estimator with 518 features.
Fitting estimator with 513 features.
Fitting estimator with 508 features.
Fitting estimator with 503 features.
Fitting estimator with 498 features.
Fitting estimator with 493 features.
Fitting estimator with 488 features.
Fitting estimator with 483 features.
Fitting estimator with 478 features.
Fitting estimator with 473 features.
Fitting estimator with 468 features.
Fitting estimator with 463 features.
Fitting estimator with 458 features.
Fitting estimator with 453 features.
Fitting estimator with 448 features.
Fitting estimator with 443 features.
Fitting estimator with 438 features.
Fitting estimator with 433 features.
Fitting estimator with 428 features.
Fitting estimator with 423 features.
Fitting estimator with 418 features.
Fitting estimator with 413 features.
Fitting estimator with 408 features.
Fitting estimator with 403 features.
Fitting estimator with 398 features.
Fitting estimator with 393 features.
Fitting estimator with 388 features.
Fitting estimator with 383 features.
Fitting estimator with 378 features.
Fitting estimator with 373 features.
Fitting estimator with 368 features.
Fitting estimator with 363 features.
Fitting estimator with 358 features.
Fitting estimator with 353 features.
Fitting estimator with 348 features.
Fitting estimator with 343 features.
Fitting estimator with 338 features.
Fitting estimator with 333 features.
Fitting estimator with 328 features.
Fitting estimator with 323 features.
Fitting estimator with 318 features.
Fitting estimator with 313 features.
Fitting estimator with 308 features.
Fitting estimator with 303 features.
Fitting estimator with 298 features.
Fitting estimator with 293 features.
Fitting estimator with 288 features.
Fitting estimator with 283 features.
Fitting estimator with 278 features.
Fitting estimator with 273 features.
Fitting estimator with 268 features.
Fitting estimator with 263 features.
Fitting estimator with 258 features.
Fitting estimator with 253 features.
Fitting estimator with 248 features.
Fitting estimator with 243 features.
Fitting estimator with 238 features.
Fitting estimator with 233 features.
Fitting estimator with 228 features.
Fitting estimator with 223 features.
Fitting estimator with 218 features.
Fitting estimator with 213 features.
Fitting estimator with 208 features.
Fitting estimator with 203 features.
Fitting estimator with 198 features.
Fitting estimator with 193 features.
Fitting estimator with 188 features.
Fitting estimator with 183 features.
Fitting estimator with 178 features.
Fitting estimator with 173 features.
Fitting estimator with 168 features.
Fitting estimator with 163 features.
Fitting estimator with 158 features.
Fitting estimator with 153 features.
Fitting estimator with 148 features.
Fitting estimator with 143 features.
Fitting estimator with 138 features.
Fitting estimator with 133 features.
Fitting estimator with 128 features.
Fitting estimator with 123 features.
Fitting estimator with 118 features.
Fitting estimator with 113 features.
Fitting estimator with 108 features.
Fitting estimator with 103 features.
In [23]:
# Compute the reduction percentage of the feature matrix
reduction_percentage = ((len(motifs.instances) - n_features) / len(motifs.instances) * 100)
# Print the reduction percentage
print("\nReduction percentage:", round(reduction_percentage, 2), "%")
Reduction percentage: 90.22 %
In [24]:
# Initialize the table that will contain the selected features
instances = []
# Save selected k-mers features
for i, mask in enumerate(rfe.support_): 
    if mask == True: instances.append(motifs.instances[i])
# Save table as biopython motifs object
features = create(instances)
In [25]:
##############################################
#####    TRAINING DATA VISUALISATION     #####
##############################################
print("\n     TRAINING DATA VISUALISATION     ")
print("=====================================")
     TRAINING DATA VISUALISATION     
=====================================
In [26]:
# Define the function to draw Scatter Plot
def generateScatterPlot(title, figure_width, figure_height, data, X, y):
    # If 2d dimensions
    if n_components == 2:
        # Initialize a 2-dimensional figure
        fig, ax = plt.subplots(figsize=(figure_width, figure_height))
    # If 3d dimensions
    else:
        # Initialize a 3-dimensional figure
        fig = plt.figure(figsize=(15, 10))
        ax = Axes3D(fig)
    # List of markers
    markers = ["o","+", "^", "x"]
    # List of colors
    colors = ["tab:blue", "tab:orange", 
              "tab:green", "tab:red", 
              "tab:purple", "tab:brown", 
              "tab:pink", "tab:grey", 
              "tab:olive", "tab:cyan",]
    
    # Iterate through the targets
    for i, target in enumerate(y):
        # Set the list of axis positions
        x = []
        y = []
        z = []
        # If the number of targets is less than 10
        if i < 10:
            color = colors[i]
            marker = markers[0]
        # If the number of targets is less than 20
        elif i < 20:
            color = colors[i-10]
            marker = markers[1]
        # If the number of targets is less than 30
        elif i < 30:
            color = colors[i-20]
            marker = markers[2]
        # If the number of targets is less than 40
        else:
            color = colors[i-30]
            marker = markers[3]
            
        # Iterate through the data
        for i, d in enumerate(data):
            # If the sequence belongs to the target of interest
            if d.annotations["target"] == target:
                # Save the value of the positions
                x.append(X[i][0])
                y.append(X[i][1])
                if n_components == 3: z.append(X[i][2])
              
        # Add the current scatter plot to the figure
        if n_components == 2:
            ax.scatter(x, y, c = color, label = target, alpha = 0.75, edgecolors = 'none', marker=marker)
        else:
            ax.scatter(x, y, z, c = color, label=target,alpha=0.75, edgecolors='none', marker=marker)

    # Display the grid
    ax.grid(True)
    # Set the legend parameters
    ax.legend(loc = 2, prop = {'size': 10})
    # Set the tite
    plt.title(title)
    # Set axes labels
    if n_components == 2:
        plt.xlabel('PC1')
        plt.ylabel('PC2')
    else: 
        ax.set_xlabel('PC1')
        ax.set_ylabel('PC2')
        ax.set_zlabel('PC3')
    # Displqy the figure
    plt.show()
In [27]:
# Instantiate a TSNE with 3 principal components
tsne = TSNE(n_components = 3, perplexity = 50, verbose=True)
# Apply TSNE to X_train
x_tsne = tsne.fit_transform(x_train)
[t-SNE] Computing 151 nearest neighbors...
[t-SNE] Indexed 2783 samples in 0.001s...
[t-SNE] Computed neighbors for 2783 samples in 0.471s...
[t-SNE] Computed conditional probabilities for sample 1000 / 2783
[t-SNE] Computed conditional probabilities for sample 2000 / 2783
[t-SNE] Computed conditional probabilities for sample 2783 / 2783
[t-SNE] Mean sigma: 0.380594
[t-SNE] KL divergence after 250 iterations with early exaggeration: 64.830734
[t-SNE] KL divergence after 1000 iterations: 1.258554
In [28]:
# Generate scatter plot of a TSNE
generateScatterPlot(title= "Scatter plot of a two-dimensional TSNE applied to the training data", 
                    figure_width = 15, 
                    figure_height = 12, 
                    data = train_data, 
                    X = x_tsne, 
                    y = set(y_train))
2021-05-02T22:37:58.980194 image/svg+xml Matplotlib v3.4.1, https://matplotlib.org/
In [29]:
# Instantiate PCA with 3 principal components
pca = PCA(n_components = 3)
x_pca =  pca.fit_transform(x_train)
In [30]:
# Generate scatter plot of a PCA
generateScatterPlot(title= "Scatter plot of a two-dimensional PCA applied to the training data", 
                    figure_width = 15, 
                    figure_height = 12, 
                    data = train_data, 
                    X = x_pca, 
                    y = set(y_train))
2021-05-02T22:38:01.269262 image/svg+xml Matplotlib v3.4.1, https://matplotlib.org/
In [31]:
##############################################
#####   MODEL TRAINING AND PREDICTION    #####
##############################################
print("\n    MODEL TRAINING AND PREDICTION    ")
print("=====================================")
    MODEL TRAINING AND PREDICTION    
=====================================
In [32]:
# Fit the model on the train set
model.fit(x_train, y_train)
# Save the model to filename model_name
joblib.dump(model, model_name)
Out[32]:
['cam.pkl']
In [33]:
# Predict with model on the test set
y_pred = model.predict(x_test)
# Display prediction
print("Predictions (" + str(len(y_pred)) + "):", y_pred)
Predictions (695): ['PA' 'PA' 'PA' 'HN' 'MX' 'HN' 'PA' 'MX' 'MX' 'MX' 'PA' 'MX' 'MX' 'MX'
 'PA' 'MX' 'HN' 'MX' 'HN' 'MX' 'MX' 'PA' 'MX' 'MX' 'MX' 'MX' 'MX' 'SV'
 'SV' 'HN' 'MX' 'HN' 'MX' 'MX' 'HN' 'PA' 'MX' 'PA' 'PA' 'MX' 'PA' 'SV'
 'MX' 'PA' 'MX' 'HN' 'PA' 'MX' 'MX' 'PA' 'PA' 'PA' 'MX' 'MX' 'MX' 'MX'
 'PA' 'PA' 'HN' 'MX' 'PA' 'MX' 'PA' 'PA' 'MX' 'PA' 'MX' 'HN' 'MX' 'MX'
 'HN' 'MX' 'MX' 'PA' 'MX' 'HN' 'HN' 'MX' 'MX' 'MX' 'MX' 'MX' 'MX' 'PA'
 'SV' 'MX' 'MX' 'PA' 'PA' 'MX' 'MX' 'MX' 'MX' 'HN' 'PA' 'MX' 'PA' 'MX'
 'SV' 'HN' 'MX' 'PA' 'MX' 'PA' 'HN' 'PA' 'MX' 'MX' 'HN' 'HN' 'MX' 'MX'
 'HN' 'MX' 'MX' 'MX' 'SV' 'MX' 'PA' 'HN' 'MX' 'MX' 'MX' 'MX' 'MX' 'MX'
 'SV' 'MX' 'PA' 'MX' 'PA' 'HN' 'MX' 'MX' 'MX' 'MX' 'MX' 'MX' 'MX' 'MX'
 'MX' 'MX' 'MX' 'SV' 'PA' 'PA' 'MX' 'MX' 'MX' 'MX' 'MX' 'MX' 'MX' 'MX'
 'MX' 'HN' 'PA' 'HN' 'HN' 'MX' 'MX' 'MX' 'HN' 'MX' 'PA' 'HN' 'MX' 'PA'
 'HN' 'PA' 'MX' 'HN' 'MX' 'MX' 'HN' 'PA' 'PA' 'MX' 'PA' 'MX' 'MX' 'MX'
 'PA' 'PA' 'SV' 'MX' 'MX' 'MX' 'MX' 'MX' 'MX' 'MX' 'MX' 'HN' 'PA' 'PA'
 'PA' 'PA' 'MX' 'MX' 'MX' 'HN' 'HN' 'MX' 'MX' 'MX' 'MX' 'MX' 'HN' 'MX'
 'PA' 'MX' 'MX' 'MX' 'MX' 'SV' 'MX' 'SV' 'MX' 'PA' 'PA' 'MX' 'PA' 'MX'
 'HN' 'PA' 'MX' 'PA' 'MX' 'MX' 'SV' 'HN' 'MX' 'MX' 'MX' 'MX' 'MX' 'PA'
 'MX' 'PA' 'HN' 'HN' 'PA' 'PA' 'HN' 'MX' 'MX' 'MX' 'PA' 'HN' 'MX' 'MX'
 'HN' 'MX' 'HN' 'MX' 'SV' 'MX' 'MX' 'PA' 'HN' 'HN' 'MX' 'MX' 'PA' 'MX'
 'MX' 'MX' 'PA' 'MX' 'MX' 'HN' 'MX' 'PA' 'PA' 'MX' 'MX' 'HN' 'PA' 'SV'
 'MX' 'PA' 'MX' 'PA' 'MX' 'MX' 'PA' 'HN' 'PA' 'MX' 'PA' 'MX' 'PA' 'MX'
 'HN' 'MX' 'MX' 'HN' 'MX' 'MX' 'MX' 'MX' 'MX' 'MX' 'MX' 'MX' 'MX' 'PA'
 'MX' 'MX' 'MX' 'HN' 'PA' 'MX' 'HN' 'PA' 'PA' 'PA' 'MX' 'HN' 'PA' 'MX'
 'MX' 'HN' 'PA' 'MX' 'MX' 'HN' 'HN' 'SV' 'SV' 'HN' 'HN' 'MX' 'HN' 'MX'
 'MX' 'PA' 'PA' 'MX' 'MX' 'MX' 'MX' 'PA' 'SV' 'MX' 'PA' 'MX' 'MX' 'HN'
 'MX' 'MX' 'MX' 'PA' 'PA' 'PA' 'PA' 'MX' 'MX' 'MX' 'PA' 'MX' 'MX' 'HN'
 'PA' 'MX' 'MX' 'MX' 'PA' 'MX' 'HN' 'MX' 'SV' 'MX' 'MX' 'MX' 'MX' 'MX'
 'MX' 'HN' 'HN' 'MX' 'MX' 'MX' 'HN' 'MX' 'HN' 'SV' 'MX' 'PA' 'HN' 'MX'
 'SV' 'PA' 'MX' 'MX' 'MX' 'MX' 'PA' 'HN' 'MX' 'MX' 'MX' 'PA' 'MX' 'MX'
 'MX' 'PA' 'MX' 'HN' 'HN' 'HN' 'HN' 'MX' 'MX' 'PA' 'MX' 'MX' 'MX' 'MX'
 'MX' 'MX' 'PA' 'HN' 'HN' 'MX' 'MX' 'PA' 'MX' 'PA' 'MX' 'MX' 'HN' 'MX'
 'MX' 'HN' 'MX' 'MX' 'PA' 'MX' 'MX' 'HN' 'MX' 'PA' 'PA' 'MX' 'HN' 'MX'
 'MX' 'MX' 'MX' 'HN' 'MX' 'MX' 'MX' 'HN' 'MX' 'HN' 'MX' 'MX' 'HN' 'MX'
 'MX' 'PA' 'PA' 'MX' 'MX' 'MX' 'MX' 'MX' 'MX' 'MX' 'HN' 'MX' 'MX' 'MX'
 'MX' 'MX' 'HN' 'MX' 'MX' 'PA' 'MX' 'HN' 'PA' 'PA' 'MX' 'MX' 'PA' 'MX'
 'MX' 'PA' 'MX' 'MX' 'MX' 'MX' 'HN' 'MX' 'MX' 'SV' 'PA' 'MX' 'MX' 'MX'
 'SV' 'SV' 'HN' 'MX' 'MX' 'PA' 'MX' 'MX' 'MX' 'MX' 'MX' 'MX' 'MX' 'SV'
 'HN' 'MX' 'PA' 'MX' 'PA' 'MX' 'PA' 'MX' 'MX' 'MX' 'MX' 'MX' 'MX' 'PA'
 'MX' 'MX' 'MX' 'HN' 'MX' 'MX' 'MX' 'MX' 'PA' 'PA' 'MX' 'MX' 'HN' 'MX'
 'MX' 'MX' 'MX' 'MX' 'MX' 'MX' 'MX' 'HN' 'HN' 'MX' 'PA' 'MX' 'MX' 'MX'
 'MX' 'MX' 'MX' 'PA' 'MX' 'PA' 'MX' 'SV' 'HN' 'PA' 'PA' 'HN' 'PA' 'HN'
 'MX' 'MX' 'MX' 'HN' 'SV' 'MX' 'MX' 'MX' 'PA' 'MX' 'MX' 'PA' 'PA' 'HN'
 'HN' 'MX' 'MX' 'PA' 'HN' 'MX' 'MX' 'HN' 'MX' 'MX' 'HN' 'PA' 'PA' 'HN'
 'MX' 'MX' 'MX' 'PA' 'PA' 'PA' 'MX' 'HN' 'MX' 'MX' 'MX' 'PA' 'MX' 'PA'
 'MX' 'MX' 'MX' 'HN' 'HN' 'MX' 'MX' 'HN' 'MX' 'MX' 'PA' 'MX' 'PA' 'PA'
 'PA' 'PA' 'MX' 'PA' 'MX' 'MX' 'MX' 'MX' 'MX' 'PA' 'PA' 'MX' 'MX' 'MX'
 'HN' 'MX' 'HN' 'MX' 'MX' 'MX' 'MX' 'PA' 'PA' 'MX' 'MX' 'HN' 'PA' 'MX'
 'PA' 'MX' 'MX' 'HN' 'PA' 'MX' 'MX' 'MX' 'MX' 'PA' 'PA' 'MX' 'PA' 'MX'
 'PA' 'MX' 'MX' 'MX' 'HN' 'MX' 'MX' 'MX' 'HN' 'MX' 'SV' 'PA' 'MX' 'MX'
 'MX' 'MX' 'PA' 'MX' 'MX' 'PA' 'PA' 'MX' 'MX']
In [34]:
##############################################
#####  MODEL PREDICTIONS VISUALISATION   #####
##############################################
print("\n   MODEL PREDICTIONS VISUALISATION   ")
print("=====================================")
   MODEL PREDICTIONS VISUALISATION   
=====================================
In [35]:
# Will contain correct and incorrect data seq_records objects
correct_data = []
incorrect_data = []
# Will contain correct and incorrect features vectors (just like x_test)
correct_features = []
incorrect_features = []
# Iterate through test data
for i, d in enumerate(test_data):
    # Add an annotation to all test data stating its percentage range of ACGT characters
    total_char = len(d.seq)
    total_acgt = 0
    for char in d.seq:
        if re.match('^[ACGT]+$', char):
            total_acgt += 1
    acgt_percent = total_acgt / total_char
    if acgt_percent >= 0.75: d.annotations["acgt-percent"] = "75-100"
    elif acgt_percent >= 0.50: d.annotations["acgt-percent"] = "50-75"
    elif acgt_percent >= 0.25: d.annotations["acgt-percent"] = "25-50"
    else: d.annotations["acgt-percent"] = "0-25"
    # Split test data into correct and incorrect sets depending on prediction results
    if y_pred[i] == d.annotations["target"]:
        correct_data.append(d)
        correct_features.append(x_test[i])
    else:
        # If it's incorrect, add the prediction class as an annotation
        d.annotations["prediction"] = y_pred[i]
        incorrect_data.append(d)
        incorrect_features.append(x_test[i])
In [36]:
# Print the classification_report
print(classification_report(y_test, y_pred, digits = 3))
              precision    recall  f1-score   support

          BZ      0.000     0.000     0.000         2
          HN      0.809     0.899     0.852        99
          MX      0.955     0.963     0.959       400
          PA      0.942     0.948     0.945       154
          SV      0.778     0.525     0.627        40

    accuracy                          0.922       695
   macro avg      0.697     0.667     0.676       695
weighted avg      0.919     0.922     0.919       695

C:\Users\Riccardo\AppData\Roaming\Python\Python39\site-packages\sklearn\metrics\_classification.py:1248: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
C:\Users\Riccardo\AppData\Roaming\Python\Python39\site-packages\sklearn\metrics\_classification.py:1248: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
C:\Users\Riccardo\AppData\Roaming\Python\Python39\site-packages\sklearn\metrics\_classification.py:1248: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
In [37]:
# Dictonaries with pair of annotation -> number of incorrect records with this annotation
subtypes = {}
countries = {}
predictions = {}
acgt_percents = {}
# Iterate through incorrect data
for i in incorrect_data:
    # Increment each kind of annotation with current record values as keys
    subtypes[i.annotations["subtype"]] = subtypes.get(i.annotations["subtype"], 0) + 1
    countries[i.annotations["country"]] = countries.get(i.annotations["country"], 0) + 1
    predictions[i.annotations["prediction"]] = predictions.get(i.annotations["prediction"], 0) + 1
    acgt_percents[i.annotations["acgt-percent"]] = acgt_percents.get(i.annotations["acgt-percent"], 0) + 1
# Display number of incorrect records for each annotation, useful to spot any pattern here
print("Incorrect predictions annotations:")
print("Subtype:", subtypes)
print("Country:", countries)
print("Prediction:", predictions)
print("ACGT percent:", acgt_percents)
Incorrect predictions annotations:
Subtype: {'B': 52, '-': 2}
Country: {'SV': 19, 'PA': 8, 'MX': 15, 'HN': 10, 'BZ': 2}
Prediction: {'PA': 9, 'MX': 18, 'HN': 21, 'SV': 6}
ACGT percent: {'75-100': 54}
In [38]:
# Compute the confusion matrix
matrix = confusion_matrix(y_true = y_test, y_pred = y_pred, labels=sorted(targets.keys()))
# Build the heatmap
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(matrix, 
            cmap = 'Blues', 
            annot = True, 
            fmt = ".0f", 
            linewidth = 0.1, 
            xticklabels = sorted(targets.keys()), 
            yticklabels = sorted(targets.keys()))
plt.title("Confusion matrix")
plt.xlabel("Predicted label")
plt.ylabel("True label")
plt.show()
2021-05-02T22:38:06.983318 image/svg+xml Matplotlib v3.4.1, https://matplotlib.org/
In [39]:
# Show percentage of occurence of all features for all target classes in both train and test data
matrix = []
# Iterate through features
for i, feature in enumerate(features.instances):
    # Generate an empty dictionary
    x = {}
    # Initialize the dictionary with targets as keys and 0 as value
    x = x.fromkeys(targets.keys(), 0)
    # Count in all train data
    for f, d in zip(x_train, train_data):
        if f[i] > 0: x[d.annotations["target"]] += 1
    # Count in all test data
    for f, d in zip(x_test, test_data):
        if f[i] > 0: x[d.annotations["target"]] += 1
    # Vector of attendance percentage
    vector = []
    # Iterate through the number of instances and the number of occurrences
    for n_instances, n_occurrences in zip(targets.values(), x.values()):
        n_instances = min(n_instances, n_samples)
        # Compute the percentage of k-mers attendance by target
        attendance_percentage = 100 - ((n_instances - n_occurrences) / n_instances * 100)
        # Save the attendance percentage in the specitic vector
        vector.append(int(attendance_percentage))
    # Save the vector of attendance percentage in the heatmap matrix
    matrix.append(vector)
# Build the heatmap
fig, ax = plt.subplots(figsize=(15, 20))
sns.heatmap(matrix, 
            annot = True, 
            fmt = ".0f", 
            cmap = 'Blues_r',
            linewidth = 0.1, 
            xticklabels = targets.keys(), 
            yticklabels = features.instances)
plt.title("Percentage of presence of k-mers according to HIV subtypes")
plt.xlabel("Target")
plt.ylabel("Features")
plt.show()
2021-05-02T22:38:14.600474 image/svg+xml Matplotlib v3.4.1, https://matplotlib.org/
In [40]:
# For all incorrect records, compute average feature vectors of all correct records for both true and predicted classes
for i_data, i_features in zip(incorrect_data[0:max_incorrect], incorrect_features[0:max_incorrect]):
    # Both matrices to plot
    true_features = []
    pred_features = []
    # Iterate through correct records
    for c_data, c_features in zip(correct_data, correct_features):
        # Compare only if both records are somewhat similar (either same subtype or acgt-percentage range)
        #if i_data.annotations["subtype"] == c_data.annotations["subtype"] or i_data.annotations["acgt-percent"] == c_data.annotations["acgt-percent"]:
        # If this correct record is in the same class as current incorrect record
        if i_data.annotations["target"] == c_data.annotations["target"]:
            true_features.append(c_features)
        # If this correct record is in the class that the current incorrect record has been predicted to  
        if i_data.annotations["prediction"] == c_data.annotations["target"]:
            pred_features.append(c_features)
    # Compute avergare matrices only if similar correct records are found (avoid div per 0)
    if len(true_features) != 0 and len(pred_features) != 0:
        true_features_mean = np.array(true_features).mean(axis=0)
        pred_features_mean = np.array(pred_features).mean(axis=0)
        # Build the heatmap
        fig, ax = plt.subplots(figsize=(40,5))
        sns.heatmap([true_features_mean, i_features, pred_features_mean], 
                #annot = True, 
                #fmt = ".0f", 
                linewidth = 0.1,
                cmap = 'Blues',
                xticklabels = features.instances,
                yticklabels = ["True", "Incorrect", "Prediction"],)
        plt.title("Comparaison of incorrect features vector with true and predicted features vectors averages")
        plt.xlabel("Features")
        plt.show()
2021-05-02T22:38:27.089345 image/svg+xml Matplotlib v3.4.1, https://matplotlib.org/
2021-05-02T22:38:33.053255 image/svg+xml Matplotlib v3.4.1, https://matplotlib.org/
2021-05-02T22:38:38.497106 image/svg+xml Matplotlib v3.4.1, https://matplotlib.org/
2021-05-02T22:38:44.747905 image/svg+xml Matplotlib v3.4.1, https://matplotlib.org/
2021-05-02T22:38:50.648947 image/svg+xml Matplotlib v3.4.1, https://matplotlib.org/
2021-05-02T22:38:56.465929 image/svg+xml Matplotlib v3.4.1, https://matplotlib.org/
2021-05-02T22:39:02.087854 image/svg+xml Matplotlib v3.4.1, https://matplotlib.org/
2021-05-02T22:39:06.433078 image/svg+xml Matplotlib v3.4.1, https://matplotlib.org/
2021-05-02T22:39:10.606693 image/svg+xml Matplotlib v3.4.1, https://matplotlib.org/
2021-05-02T22:39:15.117763 image/svg+xml Matplotlib v3.4.1, https://matplotlib.org/
In [41]:
# For all incorrect records, compare apparence percentage of all correct records in both true and predicted classes
for i_data, i_features in zip(incorrect_data[0:max_incorrect], incorrect_features[0:max_incorrect]):
    # Dictionaries containing nb of occurences of features for all correct records
    true_features = {}
    pred_features = {}
    true_features = true_features.fromkeys(features.instances, 0)
    pred_features = pred_features.fromkeys(features.instances, 0)
    true_total = 0
    pred_total = 0
    # Iterate through correct records
    for c_data, c_features in zip(correct_data, correct_features):
        # Compare only if both records are somewhat similar (either same subtype or acgt-percentage range)
        #if i_data.annotations["subtype"] == c_data.annotations["subtype"] or i_data.annotations["acgt-percent"] == c_data.annotations["acgt-percent"]:
        # If this correct record is in the same class as current incorrect record
        if i_data.annotations["target"] == c_data.annotations["target"]:
            true_total += 1
            for value, key in zip(c_features, features.instances):
                if value > 0: true_features[key] += 1
        # If this correct record is in the class that the current incorrect record has been predicted to  
        if i_data.annotations["prediction"] == c_data.annotations["target"]:
            pred_total += 1
            for value, key in zip(c_features, features.instances):
                if value > 0: pred_features[key] += 1
    # Compute avergare matrices only if similar correct records are found (avoid div per 0)
    if true_total != 0 and pred_total != 0:
        true_vector = list(map((lambda i: i / true_total), true_features.values()))
        pred_vector = list(map((lambda i: i / pred_total), pred_features.values()))
        # Build the heatmap
        fig, ax = plt.subplots(figsize=(40,5))
        sns.heatmap([true_vector, i_features, pred_vector], 
                #annot = True, 
                #fmt = ".0f", 
                linewidth = 0.1,
                cmap = 'Blues_r',
                xticklabels = features.instances,
                yticklabels = ["True", "Incorrect", "Prediction"],)
        plt.title("Comparaison of incorrect features vector with true and predicted vectors of occurences percents")
        plt.xlabel("Features")
        plt.show()
2021-05-02T22:39:20.179258 image/svg+xml Matplotlib v3.4.1, https://matplotlib.org/
2021-05-02T22:39:25.032807 image/svg+xml Matplotlib v3.4.1, https://matplotlib.org/
2021-05-02T22:39:29.292690 image/svg+xml Matplotlib v3.4.1, https://matplotlib.org/
2021-05-02T22:39:33.537860 image/svg+xml Matplotlib v3.4.1, https://matplotlib.org/
2021-05-02T22:39:37.909151 image/svg+xml Matplotlib v3.4.1, https://matplotlib.org/
2021-05-02T22:39:42.034594 image/svg+xml Matplotlib v3.4.1, https://matplotlib.org/
2021-05-02T22:39:46.051482 image/svg+xml Matplotlib v3.4.1, https://matplotlib.org/
2021-05-02T22:39:50.677523 image/svg+xml Matplotlib v3.4.1, https://matplotlib.org/
2021-05-02T22:39:55.271276 image/svg+xml Matplotlib v3.4.1, https://matplotlib.org/
2021-05-02T22:39:59.622586 image/svg+xml Matplotlib v3.4.1, https://matplotlib.org/
In [42]:
# Compute alignement of all incorrect records to all correct record and compute avegarge of scores
print("\nComparison of alignement scores between true and predicted class:")
ids = []
matrix = []
# Used to show progress
progress = ProgressBar(max_value=len(incorrect_data[0:max_incorrect])*len(correct_data[0:max_correct])).start()
count = 0
# Shuffle correct data (when we're sampling it)
shuffle(correct_data)
# Iterate through incorrect data
for i in incorrect_data[0:max_incorrect]:
    # Keep different averages for same target class and predicted target class of incorrect record
    true_score_sum = 0
    true_score_nb = 0
    pred_score_sum = 0
    pred_score_nb = 0
    # Iterate through correct data
    for c in correct_data[0:max_correct]:
        # Compare only if both records are somewhat in the same category (both same subtype and acgt-percentage range)
        #if i.annotations["subtype"] == c.annotations["subtype"] and i.annotations["acgt-percent"] == c.annotations["acgt-percent"]:
        # If this correct record is in the same class as current incorrect record
        if i.annotations["target"] == c.annotations["target"]:
            true_score_sum += pairwise2.align.globalxx(i.seq, c.seq, score_only=True)
            true_score_nb += 1
        # If this correct record is in the class that the current incorrect record has been predicted to
        if i.annotations["prediction"] == c.annotations["target"]:
            pred_score_sum += pairwise2.align.globalxx(i.seq, c.seq, score_only=True)
            pred_score_nb += 1
        # Used to show progress
        count += 1
        progress.update(count)
    # Compute avergare only if similar correct records are found (avoid div per 0)
    if true_score_nb != 0 and pred_score_nb != 0:
        ids.append(i.id)
        matrix.append([true_score_sum/true_score_nb, pred_score_sum/pred_score_nb])
# Normalise results
matrix = pd.DataFrame(np.array(matrix))
matrix = matrix.div(matrix.max(axis=1), axis=0)
# Build the heatmap
fig, ax = plt.subplots()
sns.heatmap(matrix, 
            #annot = True, 
            #fmt = ".0f", 
            linewidth = 0.1,
            cmap = 'Blues',
            xticklabels = ["True", "Prediction"], 
            yticklabels = ids)
plt.title("Comparison of alignement scores between true and predicted class")
plt.xlabel("Target")
plt.ylabel("ID")
plt.show()
  0% (9 of 6410) |                       | Elapsed Time: 0:00:00 ETA:   0:02:05
Comparison of alignement scores between true and predicted class:
100% (6410 of 6410) |####################| Elapsed Time: 0:01:23 ETA:  00:00:00
2021-05-02T22:41:26.355663 image/svg+xml Matplotlib v3.4.1, https://matplotlib.org/
In [43]:
# Tried something, did not work yet...

#features = create(["GGCGG"])
#for i in incorrect_data:
#    graphic_features = []
#    progress = ProgressBar()
#    for pos, seq in progress(features.instances.search(i.seq)):
#        graphic_features.append(GraphicFeature(start = pos, end= pos + k, strand = +1, color= "#ffd700", label=str(seq + "\n" + "Position : " + str(pos))))
#    record = GraphicRecord(sequence_length = len(i.seq), features=graphic_features)
#    record.plot(figure_width = 15)
#    plt.title("Sequence : " + i.id) 
#    plt.show()
#for c in correct_data:
#    if c.annotations["target"] == "CRB":
#        graphic_features = []
#        progress = ProgressBar()
#        for pos, seq in progress(features.instances.search(i.seq)):
#            graphic_features.append(GraphicFeature(start = pos, end= pos + k, strand = +1, color= "#ffd700", label=str(seq + "\n" + "Position : " + str(pos))))
#        record = GraphicRecord(sequence_length = len(i.seq), features=graphic_features)
#        record.plot(figure_width = 15)
#        plt.title("Sequence : " + c.id) 
#        plt.show()
#        break
#for c in correct_data:
#    if c.annotations["target"] == "OCE":
#        graphic_features = []
#        progress = ProgressBar()
#        for pos, seq in progress(features.instances.search(i.seq)):
#            graphic_features.append(GraphicFeature(start = pos, end= pos + k, strand = +1, color= "#ffd700", label=str(seq + "\n" + "Position : " + str(pos))))
#        record = GraphicRecord(sequence_length = len(i.seq), features=graphic_features)
#        record.plot(figure_width = 15)
#        plt.title("Sequence : " + c.id) 
#        plt.show()
#        break